Estimating a continuous target with Regression and Regression Tree¶
In [1]:
# === CELL 0: imports & plotting style (drop-in) ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
pd.set_option('display.float_format', lambda x: f'{x:,.3f}')
sns.set(context="notebook", style="whitegrid")
In [2]:
df=pd.read_csv('/Users/connorross/Downloads/Movie_regression.csv', index_col=0)
1. EDA ---------------------------------------------------------------------------¶
In [3]:
df.shape
Out[3]:
(506, 17)
In [4]:
df.head() # if want to drop a column then df.drop(["columnname"], axis=1, inplace=True)
Out[4]:
| Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | 3D_available | Time_taken | Twitter_hastags | Genre | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Marketing expense | |||||||||||||||||
| 20.126 | 59.620 | 0.462 | 36,524.125 | 138.700 | 7.825 | 8.095 | 7.910 | 7.995 | 7.940 | 527367 | YES | 109.600 | 223.840 | Thriller | 23 | 494 | 48000 |
| 20.546 | 69.140 | 0.531 | 35,668.655 | 152.400 | 7.505 | 7.650 | 7.440 | 7.470 | 7.440 | 494055 | NO | 146.640 | 243.456 | Drama | 42 | 462 | 43200 |
| 20.546 | 69.140 | 0.531 | 39,912.675 | 134.600 | 7.485 | 7.570 | 7.495 | 7.515 | 7.440 | 547051 | NO | 147.880 | 2,022.400 | Comedy | 38 | 458 | 69400 |
| 20.647 | 59.360 | 0.542 | 38,873.890 | 119.300 | 6.895 | 7.035 | 6.920 | 7.020 | 8.260 | 516279 | YES | 185.360 | 225.344 | Drama | 45 | 472 | 66800 |
| 21.381 | 59.360 | 0.542 | 39,701.585 | 127.700 | 6.920 | 7.070 | 6.815 | 7.070 | 8.260 | 531448 | NO | 176.480 | 225.792 | Drama | 55 | 395 | 72400 |
In [5]:
df.isnull().sum() # if wish to drop nulls then df.dropna(inplace=True)
Out[5]:
Production expense 0 Multiplex coverage 0 Budget 0 Movie_length 0 Lead_ Actor_Rating 0 Lead_Actress_rating 0 Director_rating 0 Producer_rating 0 Critic_rating 0 Trailer_views 0 3D_available 0 Time_taken 12 Twitter_hastags 0 Genre 0 Avg_age_actors 0 Num_multiplex 0 Collection 0 dtype: int64
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 506 entries, 20.1264 to 20.9482 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Production expense 506 non-null float64 1 Multiplex coverage 506 non-null float64 2 Budget 506 non-null float64 3 Movie_length 506 non-null float64 4 Lead_ Actor_Rating 506 non-null float64 5 Lead_Actress_rating 506 non-null float64 6 Director_rating 506 non-null float64 7 Producer_rating 506 non-null float64 8 Critic_rating 506 non-null float64 9 Trailer_views 506 non-null int64 10 3D_available 506 non-null object 11 Time_taken 494 non-null float64 12 Twitter_hastags 506 non-null float64 13 Genre 506 non-null object 14 Avg_age_actors 506 non-null int64 15 Num_multiplex 506 non-null int64 16 Collection 506 non-null int64 dtypes: float64(11), int64(4), object(2) memory usage: 71.2+ KB
In [10]:
# Check missing values
print("Missing values before fill:\n", df['Time_taken'].isna().sum())
# Fill missing numeric with median (keeps distribution shape)
df['Time_taken'] = df['Time_taken'].fillna(df['Time_taken'].median())
print("Missing values after fill:\n", df['Time_taken'].isna().sum())
Missing values before fill: 12 Missing values after fill: 0
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 506 entries, 20.1264 to 20.9482 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Production expense 506 non-null float64 1 Multiplex coverage 506 non-null float64 2 Budget 506 non-null float64 3 Movie_length 506 non-null float64 4 Lead_ Actor_Rating 506 non-null float64 5 Lead_Actress_rating 506 non-null float64 6 Director_rating 506 non-null float64 7 Producer_rating 506 non-null float64 8 Critic_rating 506 non-null float64 9 Trailer_views 506 non-null int64 10 3D_available 506 non-null object 11 Time_taken 506 non-null float64 12 Twitter_hastags 506 non-null float64 13 Genre 506 non-null object 14 Avg_age_actors 506 non-null int64 15 Num_multiplex 506 non-null int64 16 Collection 506 non-null int64 dtypes: float64(11), int64(4), object(2) memory usage: 71.2+ KB
In [ ]:
df.describe()
Out[ ]:
| Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | Time_taken | Twitter_hastags | Avg_age_actors | Num_multiplex | Collection | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 506.000 | 506.000 | 506.000 | 506.000 | 506.000 | 506.000 | 506.000 | 506.000 | 506.000 | 506.000 | 494.000 | 506.000 | 506.000 | 506.000 | 506.000 |
| mean | 77.274 | 0.445 | 34,911.144 | 142.075 | 8.014 | 8.186 | 8.020 | 8.191 | 7.811 | 449,860.715 | 157.391 | 260.832 | 39.182 | 545.043 | 45,057.708 |
| std | 13.721 | 0.116 | 3,903.038 | 28.149 | 1.054 | 1.054 | 1.060 | 1.050 | 0.660 | 68,917.763 | 31.295 | 104.779 | 12.514 | 106.333 | 18,364.352 |
| min | 55.920 | 0.129 | 19,781.355 | 76.400 | 3.840 | 4.035 | 3.840 | 4.030 | 6.600 | 212,912.000 | 0.000 | 201.152 | 3.000 | 333.000 | 10,000.000 |
| 25% | 65.380 | 0.376 | 32,693.952 | 118.525 | 7.316 | 7.504 | 7.296 | 7.508 | 7.200 | 409,128.000 | 132.300 | 223.796 | 28.000 | 465.000 | 34,050.000 |
| 50% | 74.380 | 0.462 | 34,488.217 | 151.000 | 8.308 | 8.495 | 8.312 | 8.465 | 7.960 | 462,460.000 | 160.000 | 254.400 | 39.000 | 535.500 | 42,400.000 |
| 75% | 91.200 | 0.551 | 36,793.542 | 167.575 | 8.865 | 9.030 | 8.884 | 9.030 | 8.260 | 500,247.500 | 181.890 | 283.416 | 50.000 | 614.750 | 50,000.000 |
| max | 110.480 | 0.615 | 48,772.900 | 173.500 | 9.435 | 9.540 | 9.425 | 9.635 | 9.400 | 567,784.000 | 217.520 | 2,022.400 | 60.000 | 868.000 | 100,000.000 |
In [9]:
# === CELL 3: featured statistics (summary table) ===
# Numeric summary
desc_num = df.select_dtypes(include=[np.number]).describe().T
desc_num['skew'] = df.select_dtypes(include=[np.number]).skew()
desc_num['kurtosis'] = df.select_dtypes(include=[np.number]).kurtosis()
desc_num
Out[9]:
| count | mean | std | min | 25% | 50% | 75% | max | skew | kurtosis | |
|---|---|---|---|---|---|---|---|---|---|---|
| Production expense | 506.000 | 77.274 | 13.721 | 55.920 | 65.380 | 74.380 | 91.200 | 110.480 | 0.295 | -1.234 |
| Multiplex coverage | 506.000 | 0.445 | 0.116 | 0.129 | 0.376 | 0.462 | 0.551 | 0.615 | -0.729 | -0.065 |
| Budget | 506.000 | 34,911.144 | 3,903.038 | 19,781.355 | 32,693.952 | 34,488.217 | 36,793.542 | 48,772.900 | 0.404 | 1.892 |
| Movie_length | 506.000 | 142.075 | 28.149 | 76.400 | 118.525 | 151.000 | 167.575 | 173.500 | -0.599 | -0.968 |
| Lead_ Actor_Rating | 506.000 | 8.014 | 1.054 | 3.840 | 7.316 | 8.308 | 8.865 | 9.435 | -1.011 | 0.498 |
| Lead_Actress_rating | 506.000 | 8.186 | 1.054 | 4.035 | 7.504 | 8.495 | 9.030 | 9.540 | -1.007 | 0.473 |
| Director_rating | 506.000 | 8.020 | 1.060 | 3.840 | 7.296 | 8.312 | 8.884 | 9.425 | -1.004 | 0.458 |
| Producer_rating | 506.000 | 8.191 | 1.050 | 4.030 | 7.508 | 8.465 | 9.030 | 9.635 | -1.005 | 0.503 |
| Critic_rating | 506.000 | 7.811 | 0.660 | 6.600 | 7.200 | 7.960 | 8.260 | 9.400 | 0.176 | -0.752 |
| Trailer_views | 506.000 | 449,860.715 | 68,917.763 | 212,912.000 | 409,128.000 | 462,460.000 | 500,247.500 | 567,784.000 | -0.844 | 0.489 |
| Time_taken | 494.000 | 157.391 | 31.295 | 0.000 | 132.300 | 160.000 | 181.890 | 217.520 | -0.473 | 1.114 |
| Twitter_hastags | 506.000 | 260.832 | 104.779 | 201.152 | 223.796 | 254.400 | 283.416 | 2,022.400 | 13.791 | 214.232 |
| Avg_age_actors | 506.000 | 39.182 | 12.514 | 3.000 | 28.000 | 39.000 | 50.000 | 60.000 | 0.013 | -1.200 |
| Num_multiplex | 506.000 | 545.043 | 106.333 | 333.000 | 465.000 | 535.500 | 614.750 | 868.000 | 0.534 | -0.121 |
| Collection | 506.000 | 45,057.708 | 18,364.352 | 10,000.000 | 34,050.000 | 42,400.000 | 50,000.000 | 100,000.000 | 1.111 | 1.517 |
In [12]:
TARGET = 'Collection'
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df[TARGET].dropna(), kde=True, ax=axes[0])
axes[0].set_title(f'{TARGET} Distribution')
# log1p (safe if zeros present)
sns.histplot(np.log1p(df[TARGET].clip(lower=0)).dropna(), kde=True, ax=axes[1])
axes[1].set_title(f'log1p({TARGET}) Distribution')
plt.tight_layout()
skew_raw = df[TARGET].skew()
skew_log = np.log1p(df[TARGET].clip(lower=0)).skew()
print(f"Skew (raw) = {skew_raw:0.3f} | Skew (log1p) = {skew_log:0.3f}")
Skew (raw) = 1.111 | Skew (log1p) = -0.335
In [14]:
# === CELL 5: correlation matrix heatmap (numeric features only) ===
num_df = df.select_dtypes(include=[np.number])
plt.figure(figsize=(12, 10))
corr = num_df.corr(numeric_only=True)
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0, square=False)
plt.title('Correlation Matrix (Numeric Features)')
plt.show()
# Optional: print top correlations with target
print("Top |corr| with target:")
print(corr[TARGET].drop(TARGET).abs().sort_values(ascending=False).head(12))
Top |corr| with target: Trailer_views 0.720 Budget 0.696 Production expense 0.485 Multiplex coverage 0.429 Num_multiplex 0.392 Movie_length 0.378 Critic_rating 0.341 Lead_ Actor_Rating 0.251 Lead_Actress_rating 0.249 Producer_rating 0.248 Director_rating 0.247 Time_taken 0.109 Name: Collection, dtype: float64
In [16]:
sns.pairplot(data=df) # makes a scatterplot matrix
Out[16]:
<seaborn.axisgrid.PairGrid at 0x31902d950>
2. Linear Regression with all data using statmodels ---------------------------------¶
You can analyze each input feature individually (nice exploration but usually skip these steps and do the full model)¶
In [11]:
# https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html
In [17]:
df_model = df.copy()
# Make sure categorical columns are of type 'category' or string
for col in ['3D_available', 'Genre']:
if col in df_model.columns:
df_model[col] = df_model[col].astype('category')
# One-hot encode (drop_first to avoid dummy trap in sklearn)
cat_cols = [c for c in ['3D_available', 'Genre'] if c in df_model.columns]
df_dum = pd.get_dummies(df_model, columns=cat_cols, drop_first=True, dtype=int)
print("Shape after dummies:", df_dum.shape)
df_dum.head()
Shape after dummies: (506, 19)
Out[17]:
| Production expense | Multiplex coverage | Budget | Movie_length | Lead_ Actor_Rating | Lead_Actress_rating | Director_rating | Producer_rating | Critic_rating | Trailer_views | Time_taken | Twitter_hastags | Avg_age_actors | Num_multiplex | Collection | 3D_available_YES | Genre_Comedy | Genre_Drama | Genre_Thriller | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Marketing expense | |||||||||||||||||||
| 20.126 | 59.620 | 0.462 | 36,524.125 | 138.700 | 7.825 | 8.095 | 7.910 | 7.995 | 7.940 | 527367 | 109.600 | 223.840 | 23 | 494 | 48000 | 1 | 0 | 0 | 1 |
| 20.546 | 69.140 | 0.531 | 35,668.655 | 152.400 | 7.505 | 7.650 | 7.440 | 7.470 | 7.440 | 494055 | 146.640 | 243.456 | 42 | 462 | 43200 | 0 | 0 | 1 | 0 |
| 20.546 | 69.140 | 0.531 | 39,912.675 | 134.600 | 7.485 | 7.570 | 7.495 | 7.515 | 7.440 | 547051 | 147.880 | 2,022.400 | 38 | 458 | 69400 | 0 | 1 | 0 | 0 |
| 20.647 | 59.360 | 0.542 | 38,873.890 | 119.300 | 6.895 | 7.035 | 6.920 | 7.020 | 8.260 | 516279 | 185.360 | 225.344 | 45 | 472 | 66800 | 1 | 0 | 1 | 0 |
| 21.381 | 59.360 | 0.542 | 39,701.585 | 127.700 | 6.920 | 7.070 | 6.815 | 7.070 | 8.260 | 531448 | 176.480 | 225.792 | 55 | 395 | 72400 | 0 | 0 | 1 | 0 |
In [18]:
# Features = all columns except target
X = df_dum.drop(columns=[TARGET])
y = df_dum[TARGET].copy()
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape
Out[18]:
((404, 18), (102, 18))
In [20]:
# Fit linear regression model
linr = LinearRegression()
linr.fit(X_train, y_train)
# Predictions
pred_tr = linr.predict(X_train)
pred_te = linr.predict(X_test)
# Metrics function (handles old sklearn versions)
def reg_metrics(y_true, y_pred, label=''):
r2 = r2_score(y_true, y_pred)
try:
# Works for sklearn >=0.22
rmse = mean_squared_error(y_true, y_pred, squared=False)
except TypeError:
# Fallback for older sklearn
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
return pd.Series({'R2': r2, 'RMSE': rmse, 'MAE': mae}, name=label)
# Evaluate on train and test sets
metrics_lr = pd.concat([
reg_metrics(y_train, pred_tr, 'Train'),
reg_metrics(y_test, pred_te, 'Test')
], axis=1)
print("Linear Regression Performance Metrics:")
display(metrics_lr)
Linear Regression Performance Metrics:
| Train | Test | |
|---|---|---|
| R2 | 0.698 | 0.609 |
| RMSE | 10,218.431 | 10,740.141 |
| MAE | 7,343.131 | 7,720.467 |
In [23]:
# === CELL 11 (Updated): statsmodels OLS with safe quoting for all column names ===
import statsmodels.formula.api as smf
# Ensure categoricals are typed as category for readability (not strictly required with C(), but nice)
for _col in ['3D_available', 'Genre']:
if _col in df.columns:
df[_col] = df[_col].astype('category')
TARGET = 'Collection'
all_cols = [c for c in df.columns if c != TARGET]
def q(col_name: str) -> str:
"""Quote a column name for Patsy when it has spaces/starts with digits."""
return f'Q("{col_name}")'
categorical_cols = [c for c in ['3D_available', 'Genre'] if c in df.columns]
# Build RHS with quoting; wrap categoricals in C()
rhs_terms = []
for c in all_cols:
if c in categorical_cols:
rhs_terms.append(f'C({q(c)})')
else:
rhs_terms.append(q(c))
formula = f'{q(TARGET)} ~ ' + ' + '.join(rhs_terms)
print("OLS formula:\n", formula)
ols_model = smf.ols(formula=formula, data=df).fit()
ols_summary = ols_model.summary()
ols_summary
OLS formula:
Q("Collection") ~ Q("Production expense") + Q("Multiplex coverage") + Q("Budget") + Q("Movie_length") + Q("Lead_ Actor_Rating") + Q("Lead_Actress_rating") + Q("Director_rating") + Q("Producer_rating") + Q("Critic_rating") + Q("Trailer_views") + C(Q("3D_available")) + Q("Time_taken") + Q("Twitter_hastags") + C(Q("Genre")) + Q("Avg_age_actors") + Q("Num_multiplex")
Out[23]:
| Dep. Variable: | Q("Collection") | R-squared: | 0.686 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.674 |
| Method: | Least Squares | F-statistic: | 59.05 |
| Date: | Wed, 15 Oct 2025 | Prob (F-statistic): | 6.67e-110 |
| Time: | 21:41:37 | Log-Likelihood: | -5392.6 |
| No. Observations: | 506 | AIC: | 1.082e+04 |
| Df Residuals: | 487 | BIC: | 1.090e+04 |
| Df Model: | 18 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -1.485e+05 | 1.67e+04 | -8.883 | 0.000 | -1.81e+05 | -1.16e+05 |
| C(Q("3D_available"))[T.YES] | 2350.3851 | 966.511 | 2.432 | 0.015 | 451.338 | 4249.432 |
| C(Q("Genre"))[T.Comedy] | 1560.6604 | 1524.385 | 1.024 | 0.306 | -1434.523 | 4555.844 |
| C(Q("Genre"))[T.Drama] | 2412.2826 | 1663.731 | 1.450 | 0.148 | -856.695 | 5681.260 |
| C(Q("Genre"))[T.Thriller] | 1929.0418 | 1494.396 | 1.291 | 0.197 | -1007.218 | 4865.301 |
| Q("Production expense") | -135.7828 | 59.491 | -2.282 | 0.023 | -252.673 | -18.893 |
| Q("Multiplex coverage") | 3.056e+04 | 1.19e+04 | 2.570 | 0.010 | 7200.431 | 5.39e+04 |
| Q("Budget") | 1.6695 | 0.158 | 10.571 | 0.000 | 1.359 | 1.980 |
| Q("Movie_length") | -25.4122 | 28.778 | -0.883 | 0.378 | -81.956 | 31.132 |
| Q("Lead_ Actor_Rating") | 4231.2170 | 7904.446 | 0.535 | 0.593 | -1.13e+04 | 1.98e+04 |
| Q("Lead_Actress_rating") | -8099.4715 | 8440.037 | -0.960 | 0.338 | -2.47e+04 | 8483.911 |
| Q("Director_rating") | 5797.2750 | 8197.711 | 0.707 | 0.480 | -1.03e+04 | 2.19e+04 |
| Q("Producer_rating") | 2800.0944 | 4407.892 | 0.635 | 0.526 | -5860.740 | 1.15e+04 |
| Q("Critic_rating") | 4216.1049 | 753.432 | 5.596 | 0.000 | 2735.725 | 5696.484 |
| Q("Trailer_views") | 0.1135 | 0.011 | 10.575 | 0.000 | 0.092 | 0.135 |
| Q("Time_taken") | 33.8507 | 15.499 | 2.184 | 0.029 | 3.398 | 64.303 |
| Q("Twitter_hastags") | 3.3355 | 4.505 | 0.740 | 0.459 | -5.516 | 12.187 |
| Q("Avg_age_actors") | 25.2103 | 38.125 | 0.661 | 0.509 | -49.699 | 100.119 |
| Q("Num_multiplex") | 8.4517 | 11.008 | 0.768 | 0.443 | -13.177 | 30.080 |
| Omnibus: | 155.065 | Durbin-Watson: | 0.950 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 591.355 |
| Skew: | 1.356 | Prob(JB): | 3.88e-129 |
| Kurtosis: | 7.549 | Cond. No. | 1.92e+07 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.92e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
Create full model with all 3 predictors¶
In [19]:
y=df.sales #outcome or target
x=df[['TV','radio', 'newspaper']] #pedictor
x=sm.add_constant(x) #adds a constant term to the predictor
In [20]:
lrmodel = sm.OLS(y,x).fit()
print(lrmodel.summary())
OLS Regression Results
==============================================================================
Dep. Variable: sales R-squared: 0.897
Model: OLS Adj. R-squared: 0.896
Method: Least Squares F-statistic: 570.3
Date: Wed, 15 Oct 2025 Prob (F-statistic): 1.58e-96
Time: 20:07:38 Log-Likelihood: -386.18
No. Observations: 200 AIC: 780.4
Df Residuals: 196 BIC: 793.6
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 2.9389 0.312 9.422 0.000 2.324 3.554
TV 0.0458 0.001 32.809 0.000 0.043 0.049
radio 0.1885 0.009 21.893 0.000 0.172 0.206
newspaper -0.0010 0.006 -0.177 0.860 -0.013 0.011
==============================================================================
Omnibus: 60.414 Durbin-Watson: 2.084
Prob(Omnibus): 0.000 Jarque-Bera (JB): 151.241
Skew: -1.327 Prob(JB): 1.44e-33
Kurtosis: 6.332 Cond. No. 454.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [21]:
### drop newspaper with bad p-value of .86, create new model
y=df.sales #outcome or target
x=df[['TV','radio']] #pedictor
x=sm.add_constant(x) #adds a constant term to the predictor
lrmodel = sm.OLS(y,x).fit()
print(lrmodel.summary())
OLS Regression Results
==============================================================================
Dep. Variable: sales R-squared: 0.897
Model: OLS Adj. R-squared: 0.896
Method: Least Squares F-statistic: 859.6
Date: Wed, 15 Oct 2025 Prob (F-statistic): 4.83e-98
Time: 20:07:49 Log-Likelihood: -386.20
No. Observations: 200 AIC: 778.4
Df Residuals: 197 BIC: 788.3
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 2.9211 0.294 9.919 0.000 2.340 3.502
TV 0.0458 0.001 32.909 0.000 0.043 0.048
radio 0.1880 0.008 23.382 0.000 0.172 0.204
==============================================================================
Omnibus: 60.022 Durbin-Watson: 2.081
Prob(Omnibus): 0.000 Jarque-Bera (JB): 148.679
Skew: -1.323 Prob(JB): 5.19e-33
Kurtosis: 6.292 Cond. No. 425.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [22]:
# ### Model looks good: R2 is large at 89%, F is large (not 0), the p-values for TV and Radio are below .01.
# ### Formula: is Sales = 2.9211 + TV x .0458 + Radio x .188 *** This formula is what your client is paying you for.
# ### Weaknesses: the data are a bit not normally distributed (but not terrible) - skew should be 0 but is -1.323 thus is negatively skewed (left tail long) Kurtosis should be 3 but is 6.2 which indicates a "heavy-tailed" distribution, which indicates some outliers
In [23]:
## Measure of fit performance (we will use RMSE)
from statsmodels.tools.eval_measures import rmse
ypredLR = lrmodel.predict(x)
rmse(y,ypredLR) #RMSE Root Mean Squared Error
Out[23]:
np.float64(1.6687030593661927)
3. Regression Tree using all data using sklearn DecisionTreeRegressor --------------------------¶
In [24]:
# documentation at https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
In [24]:
# Keep it simple per your constraints; depth can be capped for readability.
tree = DecisionTreeRegressor(random_state=42, max_depth=5, min_samples_leaf=5)
tree.fit(X_train, y_train)
pred_tr_tree = tree.predict(X_train)
pred_te_tree = tree.predict(X_test)
metrics_tree = pd.concat([
reg_metrics(y_train, pred_tr_tree, 'Train'),
reg_metrics(y_test, pred_te_tree, 'Test')
], axis=1)
metrics_tree
Out[24]:
| Train | Test | |
|---|---|---|
| R2 | 0.865 | 0.805 |
| RMSE | 6,839.681 | 7,593.442 |
| MAE | 4,665.673 | 5,755.941 |
In [25]:
# === CELL 15: Tree Feature Importances ===
imp = pd.Series(tree.feature_importances_, index=X.columns)
imp_sorted = imp.sort_values(ascending=False)
imp_sorted.head(20)
Out[25]:
Budget 0.681 Trailer_views 0.199 Lead_Actress_rating 0.038 Director_rating 0.031 Critic_rating 0.017 Avg_age_actors 0.009 Twitter_hastags 0.007 Movie_length 0.007 Producer_rating 0.006 Production expense 0.005 Time_taken 0.000 Multiplex coverage 0.000 Lead_ Actor_Rating 0.000 Num_multiplex 0.000 3D_available_YES 0.000 Genre_Comedy 0.000 Genre_Drama 0.000 Genre_Thriller 0.000 dtype: float64
visualize in tree form¶
In [26]:
plt.figure(figsize=(18, 8))
plot_tree(tree, feature_names=X.columns, filled=True, rounded=True, fontsize=8)
plt.title('Decision Tree (depth-capped)')
plt.show()
In [32]:
# === TEXT TREE (fixed): avoid name collision and print rules ===
# If you previously did: from sklearn import tree
# either remove that import, or alias it:
import sklearn.tree as sktree # safe alias if you still want the module
# Your fitted model earlier was named `tree`; let's rename it once to avoid collisions:
# If you *still* have the fitted model in memory as `tree` (estimator), do this:
try:
# is it an estimator?
from sklearn.tree import DecisionTreeRegressor
if isinstance(tree, DecisionTreeRegressor):
dt_model = tree
else:
raise TypeError("`tree` is not an estimator; re-fitting below.")
except Exception:
# Refit quickly if needed (in case `tree` got overwritten by the module)
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=42, max_depth=5, min_samples_leaf=5)
dt_model.fit(X_train, y_train)
# Now safely export text using the function, not the module
from sklearn.tree import export_text
text_rules = export_text(dt_model, feature_names=list(X.columns), decimals=2, show_weights=False)
print(text_rules)
# Optional shallow view for report
text_rules_shallow = export_text(dt_model, feature_names=list(X.columns), decimals=2, max_depth=3)
print("\n--- Shallow tree (max_depth=3) ---\n")
print(text_rules_shallow)
# Quick summary
print("\nTree depth:", dt_model.get_depth(), " | leaves:", dt_model.get_n_leaves())
|--- Budget <= 37982.31 | |--- Trailer_views <= 440392.00 | | |--- Director_rating <= 8.79 | | | |--- Movie_length <= 153.55 | | | | |--- Critic_rating <= 7.81 | | | | | |--- value: [38822.22] | | | | |--- Critic_rating > 7.81 | | | | | |--- value: [44218.18] | | | |--- Movie_length > 153.55 | | | | |--- Producer_rating <= 8.55 | | | | | |--- value: [31233.33] | | | | |--- Producer_rating > 8.55 | | | | | |--- value: [36230.00] | | |--- Director_rating > 8.79 | | | |--- Trailer_views <= 386160.50 | | | | |--- Critic_rating <= 8.02 | | | | | |--- value: [21186.05] | | | | |--- Critic_rating > 8.02 | | | | | |--- value: [30088.89] | | | |--- Trailer_views > 386160.50 | | | | |--- Lead_Actress_rating <= 9.22 | | | | | |--- value: [29900.00] | | | | |--- Lead_Actress_rating > 9.22 | | | | | |--- value: [37771.43] | |--- Trailer_views > 440392.00 | | |--- Lead_Actress_rating <= 9.29 | | | |--- Budget <= 36346.37 | | | | |--- Budget <= 33677.19 | | | | | |--- value: [40854.24] | | | | |--- Budget > 33677.19 | | | | | |--- value: [45274.42] | | | |--- Budget > 36346.37 | | | | |--- Production expense <= 62.22 | | | | | |--- value: [60516.67] | | | | |--- Production expense > 62.22 | | | | | |--- value: [51756.52] | | |--- Lead_Actress_rating > 9.29 | | | |--- value: [75240.00] |--- Budget > 37982.31 | |--- Budget <= 41312.54 | | |--- Trailer_views <= 474030.50 | | | |--- value: [49114.29] | | |--- Trailer_views > 474030.50 | | | |--- Avg_age_actors <= 36.50 | | | | |--- Budget <= 39348.84 | | | | | |--- value: [57757.14] | | | | |--- Budget > 39348.84 | | | | | |--- value: [65928.57] | | | |--- Avg_age_actors > 36.50 | | | | |--- Producer_rating <= 7.05 | | | | | |--- value: [66566.67] | | | | |--- Producer_rating > 7.05 | | | | | |--- value: [76022.22] | |--- Budget > 41312.54 | | |--- Critic_rating <= 7.48 | | | |--- value: [74800.00] | | |--- Critic_rating > 7.48 | | | |--- Twitter_hastags <= 227.31 | | | | |--- value: [81200.00] | | | |--- Twitter_hastags > 227.31 | | | | |--- Producer_rating <= 8.65 | | | | | |--- value: [93025.00] | | | | |--- Producer_rating > 8.65 | | | | | |--- value: [99700.00] --- Shallow tree (max_depth=3) --- |--- Budget <= 37982.31 | |--- Trailer_views <= 440392.00 | | |--- Director_rating <= 8.79 | | | |--- Movie_length <= 153.55 | | | | |--- truncated branch of depth 2 | | | |--- Movie_length > 153.55 | | | | |--- truncated branch of depth 2 | | |--- Director_rating > 8.79 | | | |--- Trailer_views <= 386160.50 | | | | |--- truncated branch of depth 2 | | | |--- Trailer_views > 386160.50 | | | | |--- truncated branch of depth 2 | |--- Trailer_views > 440392.00 | | |--- Lead_Actress_rating <= 9.29 | | | |--- Budget <= 36346.37 | | | | |--- truncated branch of depth 2 | | | |--- Budget > 36346.37 | | | | |--- truncated branch of depth 2 | | |--- Lead_Actress_rating > 9.29 | | | |--- value: [75240.00] |--- Budget > 37982.31 | |--- Budget <= 41312.54 | | |--- Trailer_views <= 474030.50 | | | |--- value: [49114.29] | | |--- Trailer_views > 474030.50 | | | |--- Avg_age_actors <= 36.50 | | | | |--- truncated branch of depth 2 | | | |--- Avg_age_actors > 36.50 | | | | |--- truncated branch of depth 2 | |--- Budget > 41312.54 | | |--- Critic_rating <= 7.48 | | | |--- value: [74800.00] | | |--- Critic_rating > 7.48 | | | |--- Twitter_hastags <= 227.31 | | | | |--- value: [81200.00] | | | |--- Twitter_hastags > 227.31 | | | | |--- truncated branch of depth 2 Tree depth: 5 | leaves: 22
In [29]:
# === CELL 17 (Updated): compare models (sklearn LR vs Tree, compat with older sklearn) ===
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
def rmse_compat(y_true, y_pred):
"""RMSE that works on old/new scikit-learn."""
try:
return mean_squared_error(y_true, y_pred, squared=False) # sklearn >= 0.22
except TypeError:
return np.sqrt(mean_squared_error(y_true, y_pred)) # fallback
summary = pd.DataFrame({
'LR_Test_R2': [r2_score(y_test, pred_te)],
'LR_Test_RMSE': [rmse_compat(y_test, pred_te)],
'LR_Test_MAE': [mean_absolute_error(y_test, pred_te)],
'Tree_Test_R2': [r2_score(y_test, pred_te_tree)],
'Tree_Test_RMSE': [rmse_compat(y_test, pred_te_tree)],
'Tree_Test_MAE': [mean_absolute_error(y_test, pred_te_tree)],
})
# If you ran the optional log1p model, add its back-transformed metrics (handle Series or DataFrame cases)
try:
if 'metrics_lr_log' in globals():
if isinstance(metrics_lr_log, pd.Series):
summary['LR_log1p_Test_R2(back)'] = [metrics_lr_log['R2']]
summary['LR_log1p_Test_RMSE(back)'] = [metrics_lr_log['RMSE']]
summary['LR_log1p_Test_MAE(back)'] = [metrics_lr_log['MAE']]
elif isinstance(metrics_lr_log, pd.DataFrame):
s = metrics_lr_log['Test (Back-Transformed)']
summary['LR_log1p_Test_R2(back)'] = [s['R2']]
summary['LR_log1p_Test_RMSE(back)'] = [s['RMSE']]
summary['LR_log1p_Test_MAE(back)'] = [s['MAE']]
except Exception:
pass
print("Model comparison (lower RMSE/MAE is better, higher R² is better):")
display(summary.T)
Model comparison (lower RMSE/MAE is better, higher R² is better):
| 0 | |
|---|---|
| LR_Test_R2 | 0.609 |
| LR_Test_RMSE | 10,740.141 |
| LR_Test_MAE | 7,720.467 |
| Tree_Test_R2 | 0.805 |
| Tree_Test_RMSE | 7,593.442 |
| Tree_Test_MAE | 5,755.941 |
Measure of fit performance (we will use RMSE)¶
In [33]:
# === CELL 18 (Updated): predict for a new (hypothetical) movie ===
# 1) Inspect the training categories so we can match them
cat_cols = [c for c in ['3D_available', 'Genre'] if c in df.columns]
cat_levels = {c: list(df[c].astype('category').cat.categories) for c in cat_cols}
print("Training categories:", cat_levels)
# 2) Define a new movie (ensure values exist in the 'Training categories' above)
new_movie = {
'Marketing expense': 2_000_000,
'Production expense': 5_500_000,
'Multiplex coverage': 180,
'Budget': 7_500_000,
'Movie_length': 115,
'Lead_ Actor_Rating': 7.5, # note the exact column name (with space after underscore)
'Lead_Actress_rating': 7.2,
'Director_rating': 8.0,
'Producer_rating': 7.0,
'Critic_rating': 6.8,
'Trailer_views': 1_200_000,
'3D_available': 'YES', # must be one of cat_levels['3D_available']
'Time_taken': 200,
'Twitter_hastags': 2_500, # exact spelling matches your CSV
'Genre': 'Action', # must be one of cat_levels['Genre']
'Avg_age_actors': 34,
'Num_multiplex': 190
}
new_df = pd.DataFrame([new_movie])
# 3) Force the same categorical levels as training (prevents dummy misalignment)
for c in cat_cols:
if c in new_df.columns:
new_df[c] = pd.Categorical(new_df[c], categories=cat_levels[c])
# 4) One-hot encode using the same approach as training
new_dum = pd.get_dummies(new_df, columns=cat_cols, drop_first=True, dtype=int)
# 5) Align columns with the training design matrix X (fills missing dummies with 0)
new_dum_aligned = new_dum.reindex(columns=X.columns, fill_value=0)
# 6) Predict with both models
lr_pred_val = linr.predict(new_dum_aligned)[0]
# If you kept the tree as `tree`, use that; if you renamed per earlier fix, use `dt_model`
try:
tree_pred_val = dt_model.predict(new_dum_aligned)[0]
except NameError:
tree_pred_val = tree.predict(new_dum_aligned)[0]
print(f"Predicted Collection (Linear Regression): {lr_pred_val:,.0f}")
print(f"Predicted Collection (Decision Tree): {tree_pred_val:,.0f}")
Training categories: {'3D_available': ['NO', 'YES'], 'Genre': ['Action', 'Comedy', 'Drama', 'Thriller']}
Predicted Collection (Linear Regression): -610,565,264
Predicted Collection (Decision Tree): 74,800
Optional: more on tree visualization https://mljar.com/blog/visualize-decision-tree/¶
Very Fancy Tree¶
In [ ]:
# For best results - install dtreeviz using Anaconda Navigator (which also installs the dependent packages) instead of pip below
In [ ]:
# pip install dtreeviz
In [ ]:
# pip install python-graphviz
In [ ]:
import dtreeviz
In [ ]:
# https://github.com/parrt/dtreeviz/blob/master/notebooks/dtreeviz_sklearn_visualisations.ipynb
viz = dtreeviz.model(regtreemodel2, x, y,target_name="sales", feature_names = ["TV","radio", "newspaper"])
In [ ]:
viz.view()
In [ ]:
viz.view(orientation="LR")
In [ ]:
viz.view(fancy=False)